import numpy as np from numpy import linalg as la np.random.seed(42)
defflip_signs(A, B): """ utility function for resolving the sign ambiguity in SVD http://stats.stackexchange.com/q/34396/115202 """ signs = np.sign(A) * np.sign(B) return A, B * signs
# Let the data matrix X be of n x p size, # where n is the number of samples and p is the number of variables n, p = 5, 3 X = np.random.rand(n, p) # Let us assume that it is centered X -= np.mean(X, axis=0)
# the p x p covariance matrix C = np.cov(X, rowvar=False) print ("C = \n", C) # C is a symmetric matrix and so it can be diagonalized: l, principal_axes = la.eig(C) # sort results wrt. eigenvalues idx = l.argsort()[::-1] l, principal_axes = l[idx], principal_axes[:, idx] # the eigenvalues in decreasing order print ("l = \n", l) # a matrix of eigenvectors (each column is an eigenvector) print ("V = \n", principal_axes) # projections of X on the principal axes are called principal components principal_components = X.dot(principal_axes) print ("Y = \n", principal_components)
# we now perform singular value decomposition of X # "economy size" (or "thin") SVD U, s, Vt = la.svd(X, full_matrices=False) V = Vt.T S = np.diag(s) print ("V2 = \n", V ) # 1) then columns of V are principal directions/axes. assert np.allclose(*flip_signs(V, principal_axes))
# 2) columns of US are principal components assert np.allclose(*flip_signs(U.dot(S), principal_components))
# 3) singular values are related to the eigenvalues of covariance matrix assert np.allclose((s ** 2) / (n - 1), l)